package com.diven.spark.ml.learn.feature

import org.apache.spark.sql.{DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DataTypes
import com.diven.spark.ml.learn.core.BaseTest
import com.diven.spark.ml.learn.core.BaseSpark
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession

object HashingTFTest extends BaseTest {

    /** Factory entry point: builds a fresh [[HashingTFTest]] runner as a [[BaseSpark]]. */
    def apply(): BaseSpark = {
        val runner = new HashingTFTest()
        runner
    }

}

class HashingTFTest extends BaseSpark {

    /**
     * Builds a small in-memory DataFrame of (id, words) rows used to
     * exercise the HashingTF transformer.
     *
     * @param sparkSession session used to create the DataFrame; defaults to
     *                     the session managed by [[BaseSpark]]
     * @return a DataFrame with columns "id" and "words" (array of strings)
     */
    override def getDataFrame(sparkSession: SparkSession = this.getSparkSession()): DataFrame = {
        sparkSession.createDataFrame(Seq(
            (0, Array("a", "b", "c")),
            (1, Array("a", "b", "c", "d")),
            (2, Array("a", "b", "b", "c", "e")),
            (3, Array("a", "b", "c", "c", "f")),
            (4, Array("a", "a", "b", "c", "g")),
            (5, Array("a", "a", "b", "c", "h"))
        )).toDF("id", "words")
    }

    /**
     * Applies feature hashing (HashingTF) to the "words" column and prints
     * the transformed rows and schema for inspection.
     *
     * @param dataFrame input containing a "words" column of string arrays
     */
    override def execute(dataFrame: DataFrame): Unit = {
        // Hash each token array into a fixed-length sparse term-frequency vector.
        // `val` instead of `var`: the reference is never reassigned.
        val hashed = new HashingTF()
            .setInputCol("words")        // column to transform
            .setOutputCol("words_hash")  // name of the generated feature column
            .setBinary(false)            // if true, all non-zero term counts are set to 1
            .setNumFeatures(262144)      // number of hash buckets (Spark default: 2^18 = 262144)
            .transform(dataFrame)
        // Show up to 10 rows with a wide truncation limit so the sparse
        // vector strings are printed in full.
        hashed.show(10, 1000)
        hashed.printSchema()
    }

}
